# Load the "Ask a Manager" 2021 salary survey (exported googlesheet)
# Source: https://www.askamanager.org/
ask_a_manager_2021 <- read_csv(here::here("data","ask_a_manager.csv"))

# High-level overview of the raw data (column types, missingness, ranges)
skimr::skim(ask_a_manager_2021)
Data summary
Name ask_a_manager_2021
Number of rows 26765
Number of columns 18
_______________________
Column type frequency:
character 14
logical 2
numeric 1
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
how_old_are_you 0 1.00 5 10 0 7 0
industry 62 1.00 2 171 0 1084 0
job_title 0 1.00 1 126 0 12838 0
additional_context_on_job_title 19868 0.26 1 781 0 6612 0
currency 0 1.00 3 7 0 11 0
additional_context_on_income 23851 0.11 1 1143 0 2839 0
country 0 1.00 1 209 0 297 0
state 4761 0.82 4 114 0 125 0
city 23 1.00 1 171 0 4070 0
overall_years_of_professional_experience 0 1.00 9 16 0 8 0
years_of_experience_in_field 0 1.00 9 16 0 8 0
highest_level_of_education_completed 202 0.99 3 34 0 6 0
gender 155 0.99 3 29 0 5 0
race 151 0.99 5 125 0 47 0

Variable type: logical

skim_variable n_missing complete_rate mean count
other_monetary_comp 26765 0 NaN :
currency_other 26765 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
annual_salary 0 1 144988 5488158 0 54000 75712 110000 8.7e+08 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
timestamp 0 1 2021-04-27 11:02:09 2021-09-14 21:55:44 2021-04-28 12:35:21 23989

Data cleaning

After skimming the dataset, we took the following steps to clean the raw data:

Removing NA values

# Drop rows unusable for gender/race/education analysis and non-positive
# salaries. Notes:
#  - `%in%` never returns NA, and NA in the set matches missing genders, so
#    the gender condition removes missing values as well as the listed answers
#  - race/education missingness is handled explicitly with is.na() rather than
#    relying on filter() silently dropping NA comparison results; the string
#    comparison still removes any literal "NA" text entries
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(
    !gender %in% c("NA", "Other or prefer not to answer",
                   "Prefer not to answer", "Non-binary", NA),
    !is.na(race), race != "NA",
    !is.na(highest_level_of_education_completed),
    highest_level_of_education_completed != "NA",
    annual_salary > 0
  )

Removing outliers

# Outlier removal for annual_salary.
# Option 1 (not used): z-score, drop observations beyond 3 sigma.
# ask_a_manager_2021 <- ask_a_manager_2021[-which(abs(scale(ask_a_manager_2021$annual_salary))>3),]
# Option 2 (used): Tukey fences — drop data larger than Q3 + 1.5*IQR or
# smaller than Q1 - 1.5*IQR; this is the method recommended by the stats
# community.
Q <- quantile(ask_a_manager_2021$annual_salary, probs=c(.25, .75))
iqr <- IQR(ask_a_manager_2021$annual_salary)
up <-  Q[2]+1.5*iqr # Upper fence (Q3 + 1.5*IQR)
low <-  Q[1]-1.5*iqr # Lower fence (Q1 - 1.5*IQR)
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(annual_salary>=low & annual_salary<=up)

Removing very small age group

We observe that there are only 2 respondents under 18 years old who correctly entered their data, so we remove them to make comparisons between age groups possible.

# Visualise sample size per age group, with the x axis ordered from youngest
# to oldest. factor(levels = ...) is the correct way to impose an order —
# fct_relevel() has no `levels` argument, so the original call only worked
# by accident of how forcats collects `...`.
age_levels <- c("under 18", "18-24", "25-34", "35-44",
                "45-54", "55-64", "65 or over")
ask_a_manager_2021 %>%
  group_by(how_old_are_you, gender) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = factor(how_old_are_you, levels = age_levels), y = count)) +
  geom_col() +
  labs(title = "Sample size in age groups",
       x = "Age Group",
       y = "number of samples") +
  NULL

# Only 2 under-18 respondents remain — too few to analyse, so drop them
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(how_old_are_you != "under 18")

Transforming into ordered factor datatype

Namely, we want to make age, years of working experience, education into factor, and take the log of salary for the next step of EDA.

# Convert the ordinal survey answers into factors with explicit level order,
# harmonise free-text country names, and add a log-salary column for the EDA
# below.
exp_levels <- c("1 year or less", "2 - 4 years", "5-7 years", "8 - 10 years",
                "11 - 20 years", "21 - 30 years", "31 - 40 years",
                "41 years or more")
edu_levels <- c("High School", "Some college", "College degree",
                "Master's degree", "Professional degree (MD, JD, etc.)", "PhD")

ask_a_manager_2021 <- ask_a_manager_2021 %>%
  mutate(
    # standardise country spellings (e.g. "USA" / "U.S." -> one canonical name)
    country = countrycode::countryname(country),
    age_group = factor(how_old_are_you,
                       levels = c("18-24", "25-34", "35-44",
                                  "45-54", "55-64", "65 or over")),
    field_exp = factor(years_of_experience_in_field, levels = exp_levels),
    pro_exp = factor(overall_years_of_professional_experience,
                     levels = exp_levels),
    education = factor(highest_level_of_education_completed,
                       levels = edu_levels),
    # indicator: 1 = college degree or above, 0 otherwise
    higher_edu = ifelse(education %in% c("College degree", "Master's degree",
                                         "PhD",
                                         "Professional degree (MD, JD, etc.)"),
                        1, 0),
    salary = annual_salary,
    # natural log of annual salary
    log_salary = log(annual_salary)
  ) %>%
  janitor::clean_names() # snake_case all column names

Remove problematic entries

# Inspect under-18 respondents' reported experience — the raw data contained
# people under 18 claiming 18+ years of experience.
# NOTE(review): "under 18" rows were already removed above (and "under 18" is
# not a level of age_group), so this is expected to return zero rows now;
# kept as a record of the original data issue.
ask_a_manager_2021 %>% 
  filter(age_group=="under 18") %>% 
  select(c("age_group","overall_years_of_professional_experience"))
age_groupoverall_years_of_professional_experience
# Drop logically impossible entries: reported years of experience exceeding
# the maximum possible age for the respondent's age bracket.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  mutate(
    # upper bound of the age bracket ("25-34" -> 34); 999 = effectively no cap
    max_age = case_when(
      how_old_are_you == "under 18" ~ 18,
      how_old_are_you == "65 or over" ~ 999,
      TRUE ~ as.numeric(substr(how_old_are_you, 4, 5))
    ),
    # parse_number() extracts the first number ("11 - 20 years" -> 11);
    # keep the larger of the two experience lower bounds
    min_exp = pmax(parse_number(overall_years_of_professional_experience),
                   parse_number(years_of_experience_in_field))
  ) %>%
  # delete the problematic entries and the two helper columns created here
  filter(max_age >= min_exp) %>%
  select(-c("max_age", "min_exp"))

Selecting industries

Since there are over 1000 reported industries in the dataset, we will only keep the top 25 industries which account for 92% of total entries

# The free-text industry field is messy: > 1000 distinct values. Keep only
# the names of the top-25 industries (24,597 entries, ~92% of the sample)
# and lump everything else into "Others".
industry_list <- ask_a_manager_2021 %>%
  count(industry, sort = TRUE) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  slice_max(order_by = n, n = 25) %>%
  select(industry)

ask_a_manager_2021 <- ask_a_manager_2021 %>%
  mutate(industry = replace(industry,
                            !industry %in% industry_list$industry,
                            "Others"))

Unifying currencies

Since there are 11 different currencies reported in the dataset, we want to unify the most common currencies into USD for the convenience of comparison analysis afterwards. We decided to convert all currencies with more than 100 occurrences into USD: CAD, GBP, EUR and AUD/NZD. "Other" was not converted, as there was no specific information on the currency.

# Frequency table of reported currencies (most common first)
ask_a_manager_2021 %>%
  count(currency, sort=TRUE) %>%
  mutate(percent = 100* n/sum(n))
currencynpercent
USD2026883.6    
CAD14746.08   
GBP14345.91   
EUR5512.27   
AUD/NZD4321.78   
Other570.235  
CHF290.12   
SEK10.00412

From the above table, we can see that USD, CAD, GBP, EUR and AUD/NZD are the most common currencies. We will convert them all to USD using exchange-rate data from the quantmod library; the exchange rates are taken from the last date of the survey, "2021-09-14".

# Fetch USD rates for the relevant currency pairs from Oanda, which only
# serves the last 180 days of data — so this call stops returning rates
# 180 days after "2021-09-14".
currencies <- getSymbols(c("EUR/USD", "CAD/USD", "GBP/USD", "AUD/USD", "NZD/USD"),
                         from = "2021-09-14", to = "2021-09-14", src = "oanda")

# Hard-coded snapshot of the same rates so the analysis keeps working after
# the 180-day Oanda window expires.
currencies_preserved <- tibble(
  name  = c("EUR/USD", "CAD/USD", "GBP/USD", "AUD/USD", "NZD/USD"),
  value = c(1.18, 0.79, 1.38, 0.734, 0.711)
)

Here we create a new column called converted_salary which converts the 5 most common currencies in USD. These are then rounded to the nearest whole number.

# Convert the five most common currencies to USD (rounded to the nearest
# whole number) using the preserved rates, so the analysis still works once
# getSymbols() stops returning data. A named lookup replaces the original
# fragile positional indexing (currencies_preserved$value[3] etc.), whose
# indices had to match the pair order by hand.
fx <- setNames(currencies_preserved$value, currencies_preserved$name)
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  mutate(converted_salary = case_when(
    currency == "GBP" ~ round(annual_salary * fx[["GBP/USD"]]),
    currency == "AUD/NZD" & country == "Australia" ~ round(annual_salary * fx[["AUD/USD"]]),
    currency == "AUD/NZD" & country == "New Zealand" ~ round(annual_salary * fx[["NZD/USD"]]),
    currency == "CAD" ~ round(annual_salary * fx[["CAD/USD"]]),
    currency == "EUR" ~ round(annual_salary * fx[["EUR/USD"]]),
    # everything else (incl. USD) is left as-is; unconverted currencies are
    # filtered out in the next step
    TRUE ~ round(annual_salary)
  ))
# Keep only rows whose salary is now expressed in USD terms, and drop rows
# with an unresolved country. (The excluded currencies also contained many
# outliers, which this removes as well.)
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(currency %in% c("USD", "GBP", "AUD/NZD", "EUR", "CAD"),
         !is.na(country))

# Re-check the currency frequency table after filtering
ask_a_manager_2021 %>%
  count(currency, sort = TRUE) %>%
  mutate(percent = 100 * n / sum(n))
currencynpercent
USD2012884.1 
CAD14646.12
GBP13625.69
EUR5472.29
AUD/NZD4281.79

Exploratory Data Analysis

Counts

# Age distribution of respondents. age_group is already a factor with levels
# ordered youngest-to-oldest (set during cleaning), so no releveling is
# needed — the original fct_relevel(..., levels = ...) call misused the
# function's signature and referenced the already-removed "under 18" level,
# producing a warning.
ask_a_manager_2021 %>%
  count(age_group, sort = TRUE) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  ggplot(aes(x = age_group, y = n)) +
  geom_col() +
  labs(title = "Age distribution of Ask a Manager survey respondents",
       x = "Age group",
       y = "Frequency",
       caption = "Source:Askamanager.com")

# Frequency tables for the main categorical variables.

# country
ask_a_manager_2021 %>%
  count(country, sort = TRUE) %>%
  mutate(percent = 100 * n / sum(n))

# city
ask_a_manager_2021 %>%
  count(city, sort = TRUE) %>%
  mutate(percent = 100 * n / sum(n))

# education, most common first
ask_a_manager_2021 %>%
  count(education) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  arrange(desc(percent))

# gender, most common first
ask_a_manager_2021 %>%
  count(gender) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  arrange(desc(percent))

# race, most common first
ask_a_manager_2021 %>%
  count(race) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  arrange(desc(percent))

# overall professional experience, most common first
ask_a_manager_2021 %>%
  count(pro_exp) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  arrange(desc(percent))

# experience in the current field, most common first
ask_a_manager_2021 %>%
  count(field_exp) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  arrange(desc(percent))

# TODO: add written commentary on these counts

Salary

How is salary distributed?

favstats(ask_a_manager_2021$salary) # summary stats of USD salary; note the extreme maximum
minQ1medianQ3maxmeansdnmissing
15.4e+047.5e+041.04e+051.92e+058.18e+043.75e+04239290
# Salary distribution: density and ECDF, on the raw and log scales.

# density of raw salary
g1 <- ask_a_manager_2021 %>%
  ggplot(aes(x = salary)) +
  geom_density() +
  NULL
# ECDF of raw salary, dropping the 5 largest values for readability
g2 <- ask_a_manager_2021 %>%
  slice_min(order_by = salary, n = nrow(ask_a_manager_2021) - 5) %>%
  ggplot(aes(x = salary)) +
  stat_ecdf() +
  NULL

# the log scale makes the shape much easier to read
g3 <- ask_a_manager_2021 %>%
  ggplot(aes(x = log(salary))) +
  geom_density() +
  NULL
# ECDF of log salary, again without the 5 largest values
g4 <- ask_a_manager_2021 %>%
  slice_min(order_by = salary, n = nrow(ask_a_manager_2021) - 5) %>%
  ggplot(aes(x = log(salary))) +
  stat_ecdf() +
  NULL

gridExtra::grid.arrange(g1, g2, g3, g4)

Observation: This is a right skewed distribution.

Industry

# Respondent count per industry, largest on top
ask_a_manager_2021 %>%
  count(industry, sort = TRUE) %>%
  ggplot(aes(x = n, y = fct_reorder(industry, n))) +
  geom_col()

Observations:

  • Computing & tech is the top-chosen industry among respondents, followed by Education and Non-profit
  • This indicates that the sample might over-represent respondents from industries that have higher exposure to tech and the internet.

Salary ,Gender and Industry

We are interested in the type of industries with top salary for man and woman.

# Average salary and share of higher education per industry x gender,
# reshaped to one row per industry (columns avg_salary_Man, count_Woman, ...).
salary_table <- ask_a_manager_2021 %>%
  group_by(industry, gender) %>%
  summarise(count = n(),
            avg_salary = mean(converted_salary),
            higher_edu_prop = sum(higher_edu) / n()) %>%
  # id_cols defaults to industry; the original `id_cols = 1:5` wrongly
  # included the names_from / values_from columns
  pivot_wider(names_from = gender,
              values_from = count:higher_edu_prop)

# Industries with the top average salary for men, coloured by the share of
# respondents with higher education
g_man <- salary_table %>%
  ggplot(aes(y = fct_reorder(industry, avg_salary_Man),
             x = avg_salary_Man, fill = higher_edu_prop_Man)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_text(aes(label = paste(avg_salary_Man %/% 1000, "k")),
            position = position_dodge(width = 0.9), hjust = 0, vjust = 0) +
  scale_x_continuous(labels = scales::comma) +
  scale_fill_gradient(low = "sky blue", high = "blue") +
  labs(
    title = "Man",
    x = "Annual Salary $",
    y = "Industry",
    fill = "Higher Education %"
  ) +
  theme_wsj() +
  theme(legend.position = "bottom",
        legend.title = element_text(size = 13)) +
  NULL

# pass the plot explicitly — without `plot =`, ggsave() saves the last
# *displayed* plot, and g_man was only assigned, never printed
ggsave("highest_paid_industries_male.png", plot = g_man,
       width = 12, height = 8, limitsize = F)
# Industries with the top average salary for women, coloured by the share of
# respondents with higher education
g_woman <- salary_table %>%
  ggplot(aes(y = fct_reorder(industry, avg_salary_Woman),
             x = avg_salary_Woman, fill = higher_edu_prop_Woman)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_text(aes(label = paste(avg_salary_Woman %/% 1000, "k")),
            position = position_dodge(width = 0.9), hjust = 0, vjust = 0) +
  scale_x_continuous(labels = scales::comma) +
  scale_fill_gradient(low = "pink", high = "red") +
  labs(
    title = "Woman",
    x = "Annual Salary $",
    y = "Industry",
    fill = "Higher Education %"
  ) +
  theme_wsj() +
  theme(legend.position = "bottom",
        legend.title = element_text(size = 13)) +
  NULL

# pass the plot explicitly — without `plot =`, ggsave() saves the last
# *displayed* plot, and g_woman was only assigned, never printed
ggsave("highest_paid_industries_female.png", plot = g_woman,
       width = 12, height = 8, limitsize = F)

g_man

g_woman

Observations:

  • For both man and woman, the top earning industries are alike (Law, Computing or Tech, Business or Consulting)
  • Respondents from Law, Government and Education tend to have a higher education level compared to the rest, reflecting the academic/professional requirements of these positions
  • The difference between the salary within sales and accounting, banking & finance is likely due to the subdivisions chosen based on gender nature

We are also interested in the salary difference between man and woman within the same industry.

# Salary gap (men minus women) per industry, with bar colour showing the
# male share of respondents in that industry.
ask_a_manager_2021 %>%
  group_by(industry, gender) %>%
  summarise(count = n(),
            avg_salary = mean(converted_salary)) %>%
  # id_cols defaults to industry; the original `id_cols = 1:4` wrongly
  # included the names_from / values_from columns
  pivot_wider(names_from = gender,
              values_from = count:avg_salary) %>%
  mutate(size = count_Man + count_Woman,
         male_prop = count_Man / (count_Man + count_Woman),
         salary_gap = avg_salary_Man - avg_salary_Woman) %>%
  ggplot(aes(y = fct_reorder(industry, salary_gap), x = salary_gap,
             fill = male_prop)) +
  geom_col() +

  # geom_point(aes(y=fct_reorder(industry,size),x=size*10))+

  scale_x_continuous(
    breaks = seq(-20000, 40000, 5000)
  ) +
  scale_fill_gradient(low = "pink", high = "blue") +
  labs(
    title = "Salary Gap and Sex Ratio",
    x = "Salary Gap in $",
    y = "",
    fill = "Male %"
  ) +
  theme_bw() +
  theme(legend.position = "bottom",
        legend.title = element_text(size = 8)) +
  NULL

# the plot above was printed at top level, so ggsave() picks it up
ggsave("salary_gap.png", width = 12, height = 8, limitsize = F)

Observations:

  • Man has a higher average salary than woman in most of the industries, with Law being the most significant
  • Woman has higher average salary in Art & Design
  • The proportion of male within computing or tech is significantly higher than that within other industries; yet the proportion only reaches c. 40%, reflecting the fact that most respondents are female because Ask_a_manager is more popular among women
  • The proportion of male is also higher in labour-intensive industries such as transport or logistics, engineering, utilities & telecommunications

Education

Next, we want to see if there is a difference in the salary gap between man and woman for different education level.

# Salary gap (men minus women) by education level, coloured by male share
ask_a_manager_2021 %>%
  group_by(education, gender) %>%
  summarise(count = n(),
            avg_salary = mean(converted_salary)) %>%
  # id_cols defaults to education; the original `id_cols = 1:4` wrongly
  # included the names_from / values_from columns
  pivot_wider(names_from = gender,
              values_from = count:avg_salary) %>%
  mutate(size = count_Man + count_Woman,
         male_prop = count_Man / (count_Man + count_Woman),
         salary_gap = avg_salary_Man - avg_salary_Woman) %>%
  ggplot(aes(x = education, y = salary_gap, fill = male_prop)) +
  geom_col() +
  scale_y_continuous(
    breaks = seq(-20000, 40000, 5000)
  ) +
  scale_fill_gradient(low = "pink", high = "sky blue") +
  theme_bw() +
  theme(legend.position = "bottom", axis.title = element_text(size = 8)) +
  ggtitle("Salary Gap at Different Education level") +
  xlab("Education Level") + ylab("Salary Gap in $") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  NULL

# the plot above was printed at top level, so ggsave() picks it up
ggsave("salary_gap_by_edu.png", width = 14, height = 8, limitsize = F)
# Median salary by education level, dodged by gender.
# (The `sex` column is created but not used afterwards.)
ask_a_manager_2021 %>%
  filter(gender %in% c("Man", "Woman")) %>%
  mutate(sex = factor(gender, labels = c("Man", "Woman"))) %>%
  rename(Education = education) %>%
  group_by(Education, gender) %>%
  summarise(median_salary_by_edu = median(converted_salary),
            mean_salary_by_edu = mean(converted_salary)) %>%
  ggplot() +
  geom_col(aes(Education, median_salary_by_edu, fill = gender),
           position = position_dodge(width = 0.9), alpha = 0.7) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  NULL

Race

Count

# Respondent counts for races with more than 500 respondents
ask_a_manager_2021 %>%
  group_by(race) %>%
  filter(n() > 500) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = race, y = count, fill = race)) +
  geom_col() +
  theme_wsj() +
  theme(legend.position = "none") +
  xlab("Race") + ylab("Count") +
  labs(title = "Count by Race", subtitle = "Races with > 500 Respondents") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  NULL

Race and Mean Salary

# Salary-vs-race boxplot with jittered points, for races with > 500
# respondents in the top industries.
# NOTE(review): the red reference line at 144988 matches the pre-cleaning
# mean salary from the raw skim output — confirm it is still the intended
# reference after the cleaning steps above.
ask_a_manager_2021 %>%
  filter(industry %in% industry_list$industry) %>%
  group_by(race) %>%
  filter(n() > 500) %>%
  ggplot(aes(x = race, y = converted_salary, color = race)) +
  geom_jitter(color = "black", size = 0.01, alpha = 0.9) +
  geom_boxplot(alpha = 0.4) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_hline(yintercept = 144988, color = "red") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  theme_wsj() +
  theme(legend.position = "none") +
  theme(axis.title = element_text(size = 10)) +
  ggtitle("Salary Vs. Race Boxplot") +
  ylab("Salary") + xlab("Race") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_y_continuous(labels = scales::comma, limits = c(0, 300000)) +
  NULL

Observation: In our boxplot one can visualize both the distribution of the data and the median and interquartile ranges of the data. When comparing annual salary by race, our data shows that Asians have higher salaries in comparison to Black or African American and White sample respondents. Interestingly enough, our sample also shows that the median salary for Black and AA respondents is higher than that of White respondents.

Since most respondents were White, that group's larger sample size makes it less prone to skewness from outliers when comparing it to the other two races portrayed in the plot.

# Welch two-sample t-test: does mean salary differ between White and
# Black or African American respondents?
ask_a_manager_2021_race <- ask_a_manager_2021 %>%
  group_by(race) %>%
  filter(race %in% c("Black or African American", "White"))

t.test(salary ~ race, data = ask_a_manager_2021_race)
## 
##  Welch Two Sample t-test
## 
## data:  salary by race
## t = 0.6, df = 610, p-value = 0.5
## alternative hypothesis: true difference in means between group Black or African American and group White is not equal to 0
## 95 percent confidence interval:
##  -2140  4021
## sample estimates:
## mean in group Black or African American                     mean in group White 
##                                   81976                                   81035

Observation:

When running a t-test on the two racial groups, White and Black or African American, the p-value we get is 0.5. Thus there is no statistically significant difference in their annual salary when calculating it based on this data set.

Location

# Mean salary by US state, restricted to states with more than 10
# respondents. The cutoff removes bad entries — several respondents listed
# multiple states in one answer.
states <- ask_a_manager_2021 %>%
  filter(!is.na(state)) %>%
  group_by(state) %>%
  filter(n() > 10) %>%
  summarise(mean_salary = mean(salary),
            # per-state respondent count; the original used count(state)
            # inside summarise(), which is not valid — count() expects a
            # data frame, n() gives the group size
            count_state = n())

# FIPS codes are required by plot_usmap()
states$fips <- fips(states$state)
statesalarymap <- states %>% select(state, mean_salary)

plot_usmap(data = statesalarymap, values = "mean_salary", color = "black") +
  scale_fill_continuous(
    low = "white",
    high = "light green",
    name = "Mean Salary") +
  theme(legend.position = "right") +
  labs(title = "Map of US States based on Average Salary")

# Count by Industry
ask_a_manager_2021 %>%
  count(industry, name = "count") %>%
  ggplot(aes(y = fct_reorder(industry, count), x = count)) +
  geom_col()

# Count by Age Group
ask_a_manager_2021 %>%
  count(age_group, name = "count") %>%
  ggplot(aes(y = fct_reorder(age_group, count), x = count)) +
  geom_col()
# Median (bars) and mean (points) salary by age group and gender.
ask_a_manager_age <- ask_a_manager_2021 %>%
  # name the result explicitly — the original unnamed mutate(factor(...))
  # created a column literally called "factor(age_group, ...)" and left
  # age_group untouched; the levels are the same ones set during cleaning
  mutate(age_group = factor(age_group,
                            levels = c("18-24", "25-34", "35-44",
                                       "45-54", "55-64", "65 or over")))

# "65 or over" is excluded from the comparison plot
ask_a_manager_age %>%
  filter(age_group %in% c("18-24", "25-34", "35-44", "45-54", "55-64")) %>%
  group_by(age_group, gender) %>%
  summarise(median_salary_by_age = median(salary),
            mean_salary_by_age = mean(salary)) %>%
  ggplot() +
  geom_col(aes(age_group, median_salary_by_age, fill = gender),
           position = position_dodge(width = 0.9), alpha = 0.7) +
  geom_point(aes(x = age_group, y = mean_salary_by_age, color = gender)) +
  theme_bw() +
  labs(title = "Median Salary by Age Group", y = "Salary", x = "Age Group",
       subtitle = "Points represent mean salary") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  NULL

Changed Industry? vs salary

# Flag respondents whose total experience differs from their in-field
# experience ("Yes" = likely changed industry at some point).
ask_a_manager_2021_changed <- ask_a_manager_2021 %>%
  mutate(if_changed = if_else(overall_years_of_professional_experience ==
                                years_of_experience_in_field,
                              "No", "Yes"))

# factor(levels = ...) orders the x axis correctly; fct_relevel() has no
# `levels` argument, so the original call only worked by accident
exp_order <- c("1 year or less", "2 - 4 years", "5-7 years", "8 - 10 years",
               "11 - 20 years", "21 - 30 years", "31 - 40 years",
               "41 years or more")
ask_a_manager_2021_changed %>%
  ggplot(aes(x = factor(overall_years_of_professional_experience,
                        levels = exp_order),
             y = annual_salary, color = if_changed)) +
  # scatter plot
  geom_point(alpha = 0.5) +
  # linear smooth line per group
  geom_smooth(method = "lm") +
  # legend settings
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(title = "Whether changing job affects your salary",
       x = "Total Years Spent Working",
       y = "Salary") +
  theme_bw() +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  NULL

# Median (bars) and mean (points) salary by total experience, split by
# whether the respondent changed industry.
# factor(levels = ...) replaces the misused fct_relevel(..., levels = ...).
exp_order <- c("1 year or less", "2 - 4 years", "5-7 years", "8 - 10 years",
               "11 - 20 years", "21 - 30 years", "31 - 40 years",
               "41 years or more")
ask_a_manager_2021_changed %>%
  group_by(overall_years_of_professional_experience, if_changed) %>%
  summarise(median_salary_over_time = median(salary),
            mean_salary_over_time = mean(salary)) %>%
  ggplot() +
  geom_col(aes(factor(overall_years_of_professional_experience,
                      levels = exp_order),
               median_salary_over_time, fill = if_changed),
           position = position_dodge(width = 0.9), alpha = 0.7) +
  geom_point(aes(x = overall_years_of_professional_experience,
                 y = mean_salary_over_time, color = if_changed)) +
  theme_bw() +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  # finish the truncated axis label ("Prof" in the original)
  labs(title = "Median Salary over time", y = "Salary",
       x = "Professional Experience", subtitle = "Points represent the mean") +
  NULL

# the plot above was printed at top level, so ggsave() picks it up
ggsave("switch_industry.png", width = 12, height = 8, limitsize = F)

Observation:

  • The greater the amount of experience the greater salary difference between those who switch and those who don’t.

  • The percentage increase in salary with increasing professional experience is proportional.

  • The outliers for the data were predominantly those who switched industry.

Statistical Tests

Gender vs Salary

General Statistics & Boxplot

# Per-gender summary statistics of USD salary
mosaic::favstats (converted_salary ~ gender, data=ask_a_manager_2021)
genderminQ1medianQ3maxmeansdnmissing
Man16.3e+04 9e+04       1.28e+052.62e+059.67e+044.3e+04 43030
Woman355.25e+047.11e+049.7e+04 2.55e+057.82e+043.48e+04196260
# Salary by gender: jittered points over boxplots
ask_a_manager_2021 %>%
  ggplot(aes(x = gender, y = converted_salary, color = gender)) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  geom_boxplot(alpha = 0.3) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  theme_wsj() +
  # the original repeated this theme() block twice; once is enough
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  theme(axis.title = element_text(size = 12)) +
  ggtitle("Salary Vs. Gender Boxplot with Jitter") +
  # the x axis shows gender — the original "Race" label was a copy-paste slip
  ylab("Salary") + xlab("Gender") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_y_continuous(labels = scales::comma, limits = c(0, 300000)) +
  NULL

For our following analysis, we will be comparing data from the male gender “Man” with data from the female gender “Woman.” We will filter out the data on those that are non binary.

T-Test

# Restrict to Man/Woman for the two-sample comparison, then run a Welch
# t-test of salary by gender
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(gender != "Non-binary")

t.test(converted_salary ~ gender, data = ask_a_manager_2021)
## 
##  Welch Two Sample t-test
## 
## data:  converted_salary by gender
## t = 26, df = 5602, p-value <2e-16
## alternative hypothesis: true difference in means between group Man and group Woman is not equal to 0
## 95 percent confidence interval:
##  17124 19871
## sample estimates:
##   mean in group Man mean in group Woman 
##               96721               78223

Observation: This t-test shows there is a statistically significant difference in salary between men and women.

Confidence Intervals

#mosaic::favstats (annual_salary ~ gender, data=ask_a_manager_2021)
# Encode gender as a numeric dummy (Man = 1, Woman = 0, otherwise NA).
# Mapping straight to numeric values avoids the original detour through the
# string "NA" followed by as.numeric(), which raised a coercion warning.
ask_a_manager_2021_gender <- ask_a_manager_2021 %>%
  mutate(gender2 = case_when(
    gender == "Man" ~ 1,
    gender == "Woman" ~ 0,
    TRUE ~ NA_real_
  ))

# 95% confidence interval for the mean converted salary within each gender,
# using the t distribution with n - 1 degrees of freedom.
mosaic::favstats (converted_salary ~ gender, data=ask_a_manager_2021_gender) %>%
  mutate(t_critical = qt(0.975, n - 1),
         sd_mean = sd / sqrt(n),               # standard error of the mean
         margin_of_error = t_critical * sd_mean,
         ci_lower = mean - margin_of_error,
         ci_higher = mean + margin_of_error)
genderminQ1medianQ3maxmeansdnmissingt_criticalsd_meanmargin_of_errorci_lowerci_higher
Man16.3e+04 9e+04       1.28e+052.62e+059.67e+044.3e+04 430301.966551.28e+039.54e+049.8e+04 
Woman355.25e+047.11e+049.7e+04 2.55e+057.82e+043.48e+041962601.96248487       7.77e+047.87e+04

Observation: Since the two 95% confidence intervals do not overlap, we are at least 95% confident that men have a higher average salary than women.

Bootstrap Test

# hypothesis testing using infer package
# Observed difference in mean converted salary (Woman minus Man).
# NOTE: the name `diff` shadows base::diff for the rest of this script.
diff <- ask_a_manager_2021_gender %>%
  specify(converted_salary ~ gender) %>%
  calculate(stat = "diff in means", order = c("Woman", "Man"))

# Seed so the permutation draws below are reproducible.
set.seed(1234)
# Build the null distribution: permute salaries across genders 1000 times.
salary_in_null_world <- ask_a_manager_2021_gender %>%
  filter(gender !="Non-binary") %>%
  specify(converted_salary ~ gender) %>%
   # assuming independence of gender and salary
  hypothesize(null = "independence") %>%
   # generate 1000 reps, of type "permute"
  generate(reps = 1000, type = "permute") %>%
  # calculate statistic of difference, namely "diff in means"
   calculate(stat = "diff in means", order = c("Woman", "Man")) 

# Plot the null distribution with the observed difference shaded (two-sided).
salary_in_null_world %>% 
  visualise()+
  shade_p_value(obs_stat = diff, direction = "two-sided")

# Two-sided permutation p-value for the observed difference in means.
p_value <- salary_in_null_world %>% 
  get_pvalue(obs_stat = diff, direction="both")
p_value

p_value
0
Observation: Our null hypothesis is that the difference in annual salaries between men and women is 0, and we get a p-value very close to 0. Therefore we reject the null hypothesis at the 5% significance level: there is a difference between the average male and female salaries.

Gender vs Education

We want to look at the relationship between one’s gender and education degree. In essence, we want to explore if there is a difference in the proportion of respondents with high school degree as their highest degree for male group and the female group. We can use the proportion test to generate the result from the male and female sample.

# Per-gender counts of respondents whose highest completed education is
# below a college degree ("High School" or "Some college") vs. at/above it.
ask_a_manager_2021_education <- ask_a_manager_2021 %>%
  filter(gender != "Non-binary") %>%
  group_by(gender) %>%
  summarise(
    college_below = sum(highest_level_of_education_completed %in% c("High School", "Some college")),
    college_or_above = sum(!highest_level_of_education_completed %in% c("High School", "Some college"))
  )

ask_a_manager_2021_education
gendercollege_belowcollege_or_above
Man6353668
Woman157918047
# Two-sided two-sample proportion test of the college-or-above share:
# men (3668 of 4303) vs. women (18047 of 19626).
prop.test(
  x = c(3668, 18047),               # college-or-above counts: men, women
  n = c(3668 + 635, 18047 + 1579),  # group totals: men, women
  alternative = "two.sided"
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c out of c3668 out of 3668 + 63518047 out of 18047 + 1579
## X-squared = 189, df = 1, p-value <2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.0785 -0.0557
## sample estimates:
## prop 1 prop 2 
##  0.852  0.920

From the result of the proportion test for the male and female groups, we are 95% confident that the proportion of respondents without a college degree is higher for men than for women, indicating that overall women have a higher college-attendance rate than men.

If we repeat the process to explore the difference in proportion in acquiring a masters or above degree for both gender:

# Per-gender counts of respondents holding a graduate degree (Master's,
# professional degree, or PhD) vs. everyone below that level.
ask_a_manager_2021_education <- ask_a_manager_2021 %>%
  filter(gender != "Non-binary") %>%
  group_by(gender) %>%
  summarise(
    master_below = sum(!highest_level_of_education_completed %in% c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD")),
    master_or_above = sum(highest_level_of_education_completed %in% c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD"))
  )

ask_a_manager_2021_education
gendermaster_belowmaster_or_above
Man28111492
Woman110688558
# Two-sided two-sample proportion test of the master-or-above share:
# men (1492 of 4303) vs. women (8558 of 19626).
prop.test(
  x = c(1492, 8558),                # master-or-above counts: men, women
  n = c(1492 + 2811, 8558 + 11068), # group totals: men, women
  alternative = "two.sided"
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c out of c1492 out of 1492 + 28118558 out of 8558 + 11068
## X-squared = 115, df = 1, p-value <2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.1053 -0.0734
## sample estimates:
## prop 1 prop 2 
##  0.347  0.436

From the results of the two proportion tests, we can see that even though the proportion of respondents with a college-or-above degree is lower in the male group, the proportion with a higher education degree (Master's-or-above) is higher than in the female group. This may partly explain why there are more outliers with extremely high incomes in the male sample.

Race vs Salary

Here we want to see if race is affecting the salary.

# Flag each respondent as white / non-white and keep only that flag plus
# salary for the test below.
ask_a_manager_salary <- ask_a_manager_2021 %>% 
  mutate(if_white = if_else(race == "White", "Yes", "No")) %>% 
  select(if_white, salary)

# Welch two-sample t-test of salary by the white / non-white split.
# Dropped the stray `success = "Yes"` argument: it belongs to proportion
# tests (e.g. infer::prop_test), and t.test() silently ignored it via `...`.
t.test(salary ~ if_white, data = ask_a_manager_salary)
## 
##  Welch Two Sample t-test
## 
## data:  salary by if_white
## t = 7, df = 4948, p-value = 4e-12
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  3474 6195
## sample estimates:
##  mean in group No mean in group Yes 
##             85869             81035

Race vs Education

We want to look at the relationship between one’s race and education degree. Particularly, we want to see if there is a difference in education level for those being white and those being non-white.

As in the above analysis for gender vs. education, we run two proportion tests for the college-or-above and master-or-above degree percentages between the white and non-white groups.

# Education split (below college vs. college-or-above) for white vs.
# non-white respondents.
ask_a_manager_2021_race <- ask_a_manager_2021 %>%
  mutate(if_white = if_else(race == "White", "White", "Non-white")) %>%
  group_by(if_white) %>%
  summarise(
    college_below = sum(highest_level_of_education_completed %in% c( "High School", "Some college")),
    college_or_above = sum(!highest_level_of_education_completed %in% c( "High School", "Some college"))
  )

ask_a_manager_2021_race
if_whitecollege_belowcollege_or_above
Non-white3343333
White188018382
# Two-sided two-sample proportion test of the college-or-above share:
# non-white (3333 of 3667) vs. white (18382 of 20262).
prop.test(
  x = c(3333, 18382),               # college-or-above counts: non-white, white
  n = c(3333 + 334, 18382 + 1880),  # group totals: non-white, white
  alternative = "two.sided"
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c out of c3333 out of 3333 + 33418382 out of 18382 + 1880
## X-squared = 0.09, df = 1, p-value = 0.8
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.00859  0.01200
## sample estimates:
## prop 1 prop 2 
##  0.909  0.907

Similarly, if we run the test for the proportion of master-or-above education level between white and non-white races:

# Graduate-degree split (Master's, professional degree, or PhD vs. below)
# for white vs. non-white respondents.
ask_a_manager_2021_race <- ask_a_manager_2021 %>%
  mutate(if_white = if_else(race == "White", "White", "Non-white")) %>%
  group_by(if_white) %>%
  summarise(
    master_below = sum(!highest_level_of_education_completed %in% c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD")),
    master_or_above = sum(highest_level_of_education_completed %in% c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD"))
  )

ask_a_manager_2021_race
if_whitemaster_belowmaster_or_above
Non-white22561411
White116238639
# Two-sided proportion test of graduate-degree attainment, non-white vs.
# white. Fixed a typo in the second group size: the white master-or-above
# count is 8639 (as in the table above and in `x`), not 8629.
prop.test(x = c(1411, 8639), n = c(1411 + 2256, 8639 + 11623), alternative = "two.sided")
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c out of c1411 out of 1411 + 22568639 out of 8629 + 11623
## X-squared = 22, df = 1, p-value = 3e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.0591 -0.0245
## sample estimates:
## prop 1 prop 2 
##  0.385  0.427

Observation: From the result of the proportion test, we cannot see a clear difference in the proportion of below-college education between the white and non-white groups, as the p-value is higher than 5%.

However, the proportion of acquiring higher education degree (Master’s-or-above) is higher in the white group vs. other races

Changed Industry? vs salary T.Test

# Welch two-sample t-test: mean salary of respondents who changed industry
# vs. those who did not. (ask_a_manager_2021_changed and its `if_changed`
# flag are built earlier in the analysis -- presumably a Yes/No indicator;
# confirm upstream.)
t.test(salary~if_changed, data = ask_a_manager_2021_changed)
## 
##  Welch Two Sample t-test
## 
## data:  salary by if_changed
## t = 19, df = 23826, p-value <2e-16
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  7949 9827
## sample estimates:
##  mean in group No mean in group Yes 
##             85884             76996
# Median (dodged bars) and mean (points) salary per age group, by gender.
ask_a_manager_age %>% 
  filter(age_group %in% c("18-24", "25-34" , "35-44" , "45-54" , "55-64"  )) %>% 
  group_by(age_group, gender) %>% 
  summarise (median_salary_by_age = median (salary),
             mean_salary_by_age = mean(salary)) %>% 
  ggplot () +
  geom_col(aes (age_group, median_salary_by_age, fill = gender), position = position_dodge(width = 0.9), alpha = 0.7)+
  # NOTE(review): the mean points are not dodged, so they sit at the group
  # centre rather than over their matching bar -- confirm this is intended.
  geom_point(aes(x = age_group, y=mean_salary_by_age, color = gender))+
  theme_bw()+
  #formatC(x, format = "d")+
  labs(  title =  "Median Salary by Age Group" , y="Salary" ,x = "Age Group") +
  NULL

Observation:

Changing industry has a statistically significant effect on average salary in our respondent sample. Those who changed industry earn less, in general.

Regression Analysis

# load population data of US (USDA ERS population-estimates CSV)
population_url <- "https://www.ers.usda.gov/webdocs/DataFiles/48747/PopulationEstimates.csv?v=2232"
# Aggregate the 2019 estimate to one row per area name, in millions.
# NOTE(review): area_name mixes state- and county-level rows; summing per
# name assumes names do not collide across levels -- verify against the CSV.
population <- vroom(population_url) %>% 
  janitor::clean_names() %>% 
  
  # select the latest data, namely 2019
  select(fips = fip_stxt, state= area_name, pop_estimate_2019) %>% 
  group_by(state) %>% 
  summarise(pop_estimate_2019_m = sum(pop_estimate_2019)/1000000)

# Build the regression dataset: US/USD respondents from the 15 largest
# industries, restricted to states with more than 10 responses, with
# keyword flags derived from the job title and state population joined in.
reg_data <- ask_a_manager_2021 %>% 
  filter(!is.na(state) & industry %in% industry_list$industry[1:15]) %>%
  group_by(state) %>%
  filter(n() > 10) %>%   # keep states with a usable sample size
  ungroup() %>% 
  filter(country == "US" & currency == "USD") %>% 
  mutate(job = tolower(job_title),
         # keyword indicator flags extracted from the lower-cased job title
         research = grepl("research", job),
         manager = grepl("manager", job),
         engineer = grepl("engineer", job),
         analyst = grepl("analyst", job),
         data_related = grepl("data", job),
         scientist = grepl("scientist", job),
         database = grepl("database", job),
         librarian = grepl("librarian", job),
         # whether the respondent supplied free-text context
         add_context_job = !is.na(additional_context_on_job_title),
         add_context_inc = !is.na(additional_context_on_income),
         # collapse race into four buckets; use TRUE, not the reassignable T
         race = case_when(race == "White" ~ "White",
                          race == "Asian or Asian American" ~ "Asian",
                          race == "Black or African American" ~ "Black",
                          TRUE ~ "Others"),
         ) %>% 
  left_join(population, by = c("state")) %>% 
  select(c("age_group","gender","education","field_exp","pro_exp","race",
           "salary","research","manager","engineer","industry","analyst",
           "data_related","scientist","database","librarian","add_context_job",
           "add_context_inc","pop_estimate_2019_m"))

# 70/30 train/test split, seeded for reproducibility
set.seed(1234)
data_split <- initial_split(reg_data, prop = .7)
data_train <- training(data_split)
data_test  <- testing(data_split)

# Three single-predictor baselines: does age or experience explain salary better?
m1 <- lm(salary ~ age_group, data=data_train)
m2 <- lm(salary ~ pro_exp, data=data_train)   # overall professional experience
m3 <- lm(salary ~ field_exp, data=data_train) # experience in the current field

# Side-by-side regression table (huxtable); stars suppressed for this table.
huxreg(m1, m2, m3,
       statistics = c('#observations' = 'nobs', 
                      'R squared' = 'r.squared', 
                      'Adj. R Squared' = 'adj.r.squared', 
                      'Residual SE' = 'sigma',
                      "AIC" = "AIC"), 
       bold_signif = 0.05, 
       stars = NULL
) %>% 
  set_caption('Comparison of models (age or experience?)')
Comparison of models (age or experience?)
(1)(2)(3)
(Intercept)61635.571 69671.994 65433.717 
(1832.014)(2855.294)(1625.252)
age_group25-3420407.709           
(1904.286)          
age_group35-4430716.137           
(1921.989)          
age_group45-5433106.375           
(2096.570)          
age_group55-6422772.191           
(2587.712)          
age_group65 or over30382.329           
(5556.773)          
pro_exp2 - 4 years     1345.220      
     (3057.459)     
pro_exp5-7 years     8494.468      
     (2973.095)     
pro_exp8 - 10 years     16057.951      
     (2961.636)     
pro_exp11 - 20 years     22547.278      
     (2914.158)     
pro_exp21 - 30 years     27128.221      
     (3013.121)     
pro_exp31 - 40 years     21967.309      
     (3475.103)     
pro_exp41 years or more     18277.526      
     (5948.339)     
field_exp2 - 4 years          6838.122 
          (1781.163)
field_exp5-7 years          17214.794 
          (1763.885)
field_exp8 - 10 years          25866.721 
          (1810.061)
field_exp11 - 20 years          32427.263 
          (1767.441)
field_exp21 - 30 years          40462.832 
          (2095.928)
field_exp31 - 40 years          32929.075 
          (3288.613)
field_exp41 years or more          41222.125 
          (8428.581)
#observations11366     11366     11366     
R squared0.036 0.047 0.090 
Adj. R Squared0.036 0.046 0.089 
Residual SE37095.450 36898.532 36049.843 
AIC271432.355 271313.361 270784.405 
# Add more variables step by step to explore and improve accuracy.
# Cleanup: the m9-m11 formulas contained an accidental "++" after
# pop_estimate_2019_m; "a ++ b" parses as "a + +b" (same model, but
# misleading), so it is written as a single "+" here.
m4 <- lm(salary ~ field_exp+gender, data=data_train)
m5 <- lm(salary ~ field_exp+gender+gender*field_exp, data=data_train)
m6 <- lm(salary ~ field_exp+gender+education, data=data_train)
m7 <- lm(salary ~ field_exp+gender+gender*field_exp+education+race, data=data_train)
m8 <- lm(salary ~ field_exp+gender+gender*field_exp+education+race+pop_estimate_2019_m, data=data_train)
m9 <- lm(salary ~ field_exp+gender+gender*field_exp+education+race+pop_estimate_2019_m+research+manager+engineer+industry+data_related+scientist+database+librarian, data=data_train)
m10 <- lm(salary ~ field_exp+gender+gender*field_exp+education+race+pop_estimate_2019_m+research+manager+engineer+industry+data_related+scientist+database+librarian+add_context_job+add_context_inc, data=data_train)
m11 <- lm(salary ~ field_exp+gender+gender*field_exp+education+race+pop_estimate_2019_m+research+manager+engineer+industry+data_related+scientist+database+librarian+add_context_job+add_context_inc+data_related*scientist, data=data_train)

# Side-by-side regression table for the eight extended models.
huxreg(m4, m5, m6, m7, m8, m9, m10, m11,
       statistics = c('#observations' = 'nobs', 
                      'R squared' = 'r.squared', 
                      'Adj. R Squared' = 'adj.r.squared', 
                      'Residual SE' = 'sigma',
                      "AIC" = "AIC"), 
       bold_signif = 0.05
) %>% 
  set_caption('Comparison of models (adding variables)')
Comparison of models (adding variables)
(1)(2)(3)(4)(5)(6)(7)(8)
(Intercept)82483.049 ***69628.934 ***61442.370 ***68574.888 ***57853.358 ***42915.253 ***43368.902 ***43282.577 ***
(1765.664)   (4507.873)   (3166.399)   (5296.351)   (5263.800)   (4598.579)   (4594.684)   (4594.386)   
field_exp2 - 4 years6196.971 ***11580.886 *  5982.387 ***13174.453 ** 13505.674 ** 10899.235 ** 10947.596 ** 11045.310 ** 
(1743.832)   (4852.730)   (1703.013)   (4688.376)   (4627.460)   (3968.051)   (3963.290)   (3963.175)   
field_exp5-7 years16377.048 ***27928.293 ***16152.069 ***29079.987 ***29292.137 ***26469.305 ***26458.740 ***26543.246 ***
(1727.091)   (4796.247)   (1688.233)   (4633.690)   (4573.462)   (3922.239)   (3917.520)   (3917.328)   
field_exp8 - 10 years25199.227 ***38864.445 ***24544.398 ***40456.388 ***40503.349 ***37719.869 ***37833.908 ***37957.127 ***
(1772.137)   (4913.937)   (1733.477)   (4748.284)   (4686.551)   (4021.405)   (4016.630)   (4016.700)   
field_exp11 - 20 years31141.420 ***47860.450 ***31052.911 ***49988.943 ***50798.309 ***46225.827 ***46395.762 ***46503.502 ***
(1731.126)   (4768.854)   (1694.291)   (4608.543)   (4548.865)   (3905.641)   (3901.098)   (3901.069)   
field_exp21 - 30 years37525.013 ***60021.389 ***38730.943 ***64272.212 ***64767.293 ***59023.009 ***59130.650 ***59252.556 ***
(2055.960)   (5137.828)   (2011.519)   (4966.416)   (4901.929)   (4208.708)   (4203.698)   (4203.712)   
field_exp31 - 40 years30935.521 ***46038.954 ***32456.827 ***49881.389 ***51156.019 ***43022.829 ***43078.827 ***43222.879 ***
(3220.494)   (7399.567)   (3146.731)   (7153.386)   (7060.764)   (6079.513)   (6072.245)   (6072.030)   
field_exp41 years or more35440.467 ***43913.566 ***37627.381 ***50078.983 ***48625.778 ***41841.212 ***41905.812 ***42027.137 ***
(8254.891)   (13238.881)   (8059.436)   (12791.714)   (12625.683)   (10823.046)   (10810.091)   (10809.049)   
genderWoman-19462.346 ***-4788.972    -21123.723 ***-5613.328    -5308.446    3823.444    3746.403    3870.918    
(874.130)   (4816.323)   (858.022)   (4652.743)   (4592.285)   (3939.171)   (3934.651)   (3934.753)   
field_exp2 - 4 years:genderWoman        -5812.696            -7175.929    -7008.995    -5215.950    -5146.048    -5235.200    
        (5198.644)           (5022.080)   (4956.796)   (4249.195)   (4244.143)   (4243.921)   
field_exp5-7 years:genderWoman        -13109.252 *          -13194.269 ** -13241.990 ** -10388.510 *  -10230.593 *  -10314.224 *  
        (5139.849)           (4965.449)   (4900.892)   (4200.445)   (4195.502)   (4195.256)   
field_exp8 - 10 years:genderWoman        -15637.001 **         -16538.217 ** -16070.142 ** -12689.203 ** -12650.008 ** -12762.324 ** 
        (5267.091)           (5088.890)   (5022.800)   (4306.057)   (4300.910)   (4300.829)   
field_exp11 - 20 years:genderWoman        -19445.181 ***        -19952.011 ***-20466.473 ***-13887.439 ***-13903.459 ***-14009.758 ***
        (5117.237)           (4945.171)   (4880.967)   (4186.693)   (4181.661)   (4181.561)   
field_exp21 - 30 years:genderWoman        -27971.809 ***        -29147.813 ***-29545.621 ***-21768.894 ***-21679.884 ***-21795.637 ***
        (5619.375)           (5429.326)   (5358.786)   (4596.536)   (4591.046)   (4590.930)   
field_exp31 - 40 years:genderWoman        -17581.031 *          -17503.351 *  -17950.723 *  -7648.653    -7433.630    -7553.913    
        (8228.457)           (7949.878)   (7846.561)   (6748.574)   (6740.577)   (6740.108)   
field_exp41 years or more:genderWoman        -7106.165            -10096.647    -11137.299    337.307    -396.326    -543.267    
        (17053.820)           (16478.297)   (16264.168)   (13939.868)   (13924.525)   (13923.153)   
educationSome college                6664.759 *  6636.025 *  6334.711 *  5116.750 *  5004.721 *  4982.621 *  
                (2996.167)   (2959.619)   (2921.192)   (2503.433)   (2500.511)   (2500.253)   
educationCollege degree                20794.145 ***20287.413 ***19796.872 ***18576.157 ***18436.607 ***18425.183 ***
                (2747.005)   (2714.021)   (2678.884)   (2303.248)   (2300.621)   (2300.366)   
educationMaster's degree                23589.736 ***23335.704 ***23333.386 ***28704.406 ***28548.266 ***28515.267 ***
                (2765.094)   (2731.568)   (2696.054)   (2327.655)   (2325.034)   (2324.834)   
educationProfessional degree (MD, JD, etc.)                45943.690 ***45707.334 ***45647.748 ***52250.081 ***52182.832 ***52132.988 ***
                (3081.052)   (3043.512)   (3003.945)   (2692.734)   (2689.748)   (2689.570)   
educationPhD                36354.897 ***36615.006 ***36387.719 ***45722.324 ***45422.768 ***45417.812 ***
                (3081.974)   (3044.832)   (3005.274)   (2632.831)   (2630.980)   (2630.682)   
raceBlack                        -18415.931 ***-15219.157 ***-10056.790 ***-9991.410 ***-10010.249 ***
                        (2494.993)   (2469.422)   (2120.425)   (2117.969)   (2117.751)   
raceOthers                        -19825.444 ***-17587.033 ***-11492.685 ***-11466.127 ***-11474.295 ***
                        (1862.273)   (1842.573)   (1582.357)   (1580.617)   (1580.442)   
raceWhite                        -23129.046 ***-18850.259 ***-11922.140 ***-11797.448 ***-11823.976 ***
                        (1543.285)   (1543.007)   (1327.160)   (1325.828)   (1325.751)   
pop_estimate_2019_m                                496.117 ***414.710 ***413.930 ***413.058 ***
                                (28.559)   (24.571)   (24.546)   (24.547)   
researchTRUE                                        -1355.889    -1207.748    -875.806    
                                        (1799.811)   (1797.860)   (1806.191)   
managerTRUE                                        8292.792 ***8229.515 ***8232.859 ***
                                        (719.191)   (718.516)   (718.436)   
engineerTRUE                                        22900.445 ***22672.079 ***22686.156 ***
                                        (1090.997)   (1090.502)   (1090.403)   
industryBusiness or Consulting                                        14160.591 ***14381.124 ***14444.830 ***
                                        (1737.439)   (1735.976)   (1736.104)   
industryComputing or Tech                                        19739.674 ***19520.670 ***19519.668 ***
                                        (1217.430)   (1217.248)   (1217.110)   
industryEducation (Higher Education)                                        -24655.285 ***-24421.383 ***-24394.782 ***
                                        (1321.490)   (1321.082)   (1321.006)   
industryEducation (Primary/Secondary)                                        -27573.822 ***-27838.305 ***-27799.251 ***
                                        (1690.342)   (1690.152)   (1690.085)   
industryEngineering or Manufacturing                                        -3540.355 *  -3336.457 *  -3256.983 *  
                                        (1490.069)   (1488.809)   (1489.232)   
industryGovernment and Public Administration                                        -6801.635 ***-6551.443 ***-6437.312 ***
                                        (1402.622)   (1401.703)   (1402.840)   
industryHealth care                                        -4316.280 ** -4243.986 ** -4164.012 ** 
                                        (1354.116)   (1352.599)   (1353.105)   
industryInsurance                                        -3274.070    -3313.004    -3312.386    
                                        (1986.342)   (1984.418)   (1984.192)   
industryLaw                                        -5477.892 ** -5400.347 ** -5376.811 ** 
                                        (1785.729)   (1783.907)   (1783.747)   
industryMarketing, Advertising & PR                                        450.697    378.725    363.498    
                                        (1561.097)   (1559.309)   (1559.152)   
industryMedia & Digital                                        -4514.505 *  -4483.965 *  -4457.684 *  
                                        (1775.735)   (1773.998)   (1773.850)   
industryNonprofits                                        -15804.795 ***-15631.462 ***-15611.513 ***
                                        (1270.969)   (1269.925)   (1269.824)   
industryRecruitment or HR                                        -2879.708    -3112.659    -3128.515    
                                        (2063.699)   (2061.669)   (2061.451)   
industryRetail                                        -20691.092 ***-20791.356 ***-20777.168 ***
                                        (2110.220)   (2108.144)   (2107.917)   
data_relatedTRUE                                        7365.560 ***7170.066 ***5791.150 ** 
                                        (1884.875)   (1883.043)   (2018.888)   
scientistTRUE                                        12911.077 ***12713.710 ***9426.006 ** 
                                        (2605.884)   (2603.049)   (3129.243)   
databaseTRUE                                        -16577.967 *  -16550.496 *  -15181.333 *  
                                        (6445.897)   (6438.143)   (6477.934)   
librarianTRUE                                        -11263.970 ***-11402.890 ***-11437.726 ***
                                        (2118.660)   (2116.375)   (2116.214)   
add_context_jobTRUE                                                -2928.628 ***-2916.277 ***
                                                (623.222)   (623.185)   
add_context_incTRUE                                                2995.721 ** 2993.794 ** 
                                                (919.047)   (918.943)   
data_relatedTRUE:scientistTRUE                                                        10379.524    
                                                        (5484.449)   
#observations11366        11366        11366        11366        11366        11366        11366        11366        
R squared0.128    0.133    0.170    0.191    0.212    0.423    0.425    0.425    
Adj. R Squared0.127    0.131    0.169    0.190    0.211    0.421    0.422    0.423    
Residual SE35289.480    35207.617    34435.944    34006.685    33564.555    28743.282    28708.695    28705.422    
AIC270300.812    270255.011    269749.237    269474.075    269177.590    265673.551    265648.174    265646.577    
*** p < 0.001; ** p < 0.01; * p < 0.05.
# calculate rmse on test set of 11 models
models <- list(m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11)
rmses <- numeric(length(models))  # preallocate instead of growing in the loop

# Total sum of squares of the test response is loop-invariant; hoist it.
SS.total <- with(data_test, sum((salary - mean(salary))^2))

for (i in seq_along(models)) {
  model <- models[[i]]
  print(paste("model", i))

  preds <- predict(model, data_test)
  rmses[i] <- RMSE(
    pred = preds,
    obs = data_test$salary
  )
  print(paste("RMSE on train data:", summary(model)$sigma))
  print(paste("RMSE on test data:", rmses[i]))

  # NOTE(review): SS.regression / SS.total measures how much of the test
  # variance the predictions span; it is not the conventional out-of-sample
  # R^2 (1 - SS.residual / SS.total). Kept as in the original analysis.
  SS.regression <- sum((preds - mean(data_test$salary))^2)

  print(paste("R Squared on train data", summary(model)$r.squared))
  print(paste("R Squared on test data", SS.regression/SS.total))  # fraction of variation explained by the model
}
## [1] "model 1"
## [1] "RMSE on train data: 37095.4502501071"
## [1] "RMSE on test data: 36985.4136059742"
## [1] "R Squared on train data 0.0362110702473326"
## [1] "R Squared on test data 0.0373349296083914"
## [1] "model 2"
## [1] "RMSE on train data: 36898.5318250807"
## [1] "RMSE on test data: 36597.9066711955"
## [1] "R Squared on train data 0.0465841987199754"
## [1] "R Squared on test data 0.0476371751180191"
## [1] "model 3"
## [1] "RMSE on train data: 36049.8427580782"
## [1] "RMSE on test data: 35753.5340710996"
## [1] "R Squared on train data 0.089938121540009"
## [1] "R Squared on test data 0.0926641460655427"
## [1] "model 4"
## [1] "RMSE on train data: 35289.4799778895"
## [1] "RMSE on test data: 34924.6767727815"
## [1] "R Squared on train data 0.128000066311433"
## [1] "R Squared on test data 0.132199529165854"
## [1] "model 5"
## [1] "RMSE on train data: 35207.617443797"
## [1] "RMSE on test data: 34830.0983571456"
## [1] "R Squared on train data 0.132575981039106"
## [1] "R Squared on test data 0.137322501017178"
## [1] "model 6"
## [1] "RMSE on train data: 34435.9443290408"
## [1] "RMSE on test data: 34127.075354642"
## [1] "R Squared on train data 0.170037088957289"
## [1] "R Squared on test data 0.17746586804058"
## [1] "model 7"
## [1] "RMSE on train data: 34006.6853010122"
## [1] "RMSE on test data: 33893.6113771057"
## [1] "R Squared on train data 0.191312823275399"
## [1] "R Squared on test data 0.200353434956173"
## [1] "model 8"
## [1] "RMSE on train data: 33564.5551861195"
## [1] "RMSE on test data: 33554.0911706585"
## [1] "R Squared on train data 0.212273508211821"
## [1] "R Squared on test data 0.22268113081223"
## [1] "model 9"
## [1] "RMSE on train data: 28743.2816284638"
## [1] "RMSE on test data: 28711.9924589569"
## [1] "R Squared on train data 0.423390974628695"
## [1] "R Squared on test data 0.413260520145657"
## [1] "model 10"
## [1] "RMSE on train data: 28708.6953277177"
## [1] "RMSE on test data: 28676.5016877011"
## [1] "R Squared on train data 0.424879416748037"
## [1] "R Squared on test data 0.415327383057971"
## [1] "model 11"
## [1] "RMSE on train data: 28705.4215821016"
## [1] "RMSE on test data: 28649.524717651"
## [1] "R Squared on train data 0.425061378093912"
## [1] "R Squared on test data 0.415699021430563"
# For model 10, the test-set RMSE (28676.5) is close to the training RMSE
# (28708.7), indicating there is no heavy overfitting problem.